/*

 Program cleanimss.do does basic cleaning and makes new variables
 from IMSS data.

 Kumler, Verhoogen, Frias "Enlisting Employees ..." REStat forthcoming

*/


****************************** housekeeping *************************************;

#delimit;
set more off;

*run the shared project setup script before anything else;
*NOTE(review): presumably this defines the code, work and tmp globals
 and opens the log that is closed at the end of this file -- confirm;
do ${code}housekeeping.do;

**** prepare auxiliary datasets ********;

*three temporary lookup files used by the merges in the cleaning loop;
tempfile d1 d2 d3;

*d1: establishment directory, reduced to the registro key plus the
 location, industry (frac) and minimum-wage zone fields;
use registro ent mpio_imss frac zona using ${work}imssdir_zona;
save `d1', replace;

*d2: minimum wages with top-codes, copied unchanged;
*minwages_wtopcodes.dta includes cpi;
use ${work}minwages_wtopcodes;
save `d2', replace;

*d3: IMSS-to-INEGI municipality concordance, rows with a blank IMSS code
 excluded, sorted by the merge key;
use mpio_imss entmpio_inegi if mpio_imss~="" using ${work}imss_inegi_mpio_codes;
sort mpio_imss;
save `d3', replace;


*********************** initial cleaning of imss datasets ********************;

*loop over years 1985-2005;
*loop over quarters (months 3, 6, 9, 12);

*helper program make_real_wages: builds the real-wage variables rsal1-rsal6
 from sal, cpi and the minimum-wage bounds currently in memory. It is called
 once on the worker-level data and once on the registro-level data so that
 both levels follow exactly the same rules. Stata treats a missing value as
 larger than any number, so every upper-bound test also requires rsal1 to be
 nonmissing -- otherwise an observation with a missing real wage would be
 assigned the top-code or the upper percentile;

capture program drop make_real_wages;
program define make_real_wages;

     *rsal1: nominal wage deflated by cpi (cpi is divided by 100 first);
     gen rsal1 = sal/(cpi/100);

     *rsal2: rsal1 bottom-coded at the deflated salmin and top-coded at the
      deflated topsal of the same observation;
     gen rsal2 = rsal1;
     replace rsal2 = salmin/(cpi/100) if rsal1<salmin/(cpi/100);
     replace rsal2 = topsal/(cpi/100) if rsal1>topsal/(cpi/100) & rsal1~=.;

     *rsal3: zone-specific 1991 bottom-code and zone-specific top-code for
      the entire period;
     gen rsal3 = rsal1;
     replace rsal3 = rsalmin91_byzone if rsal1<rsalmin91_byzone;
     replace rsal3 = rtopsalmin_byzone if rsal1>rtopsalmin_byzone & rsal1~=.;

     *rsal4: global (i.e. not by zone) 1991 bottom-code and top-code for the
      entire period;
     gen rsal4 = rsal1;
     replace rsal4 = rsalmin91_zonea if rsal1<rsalmin91_zonea;
     replace rsal4 = rtopsalmin_zonec if rsal1>rtopsalmin_zonec & rsal1~=.;

     *rsal5: winsorized at the 5th and 95th percentiles of rsal1, computed
      over all observations in memory when the program is called;
     gen rsal5 = rsal1;
     egen rsal5_p95 = pctile(rsal1), p(95);
     egen rsal5_p5 = pctile(rsal1), p(5);
     replace rsal5 = rsal5_p95 if rsal1>rsal5_p95 & rsal1~=.;
     replace rsal5 = rsal5_p5 if rsal1<rsal5_p5;
     drop rsal5_p95 rsal5_p5;

     *rsal6: same as rsal5 but at the 10th and 90th percentiles;
     gen rsal6 = rsal1;
     egen rsal6_p90 = pctile(rsal1), p(90);
     egen rsal6_p10 = pctile(rsal1), p(10);
     replace rsal6 = rsal6_p90 if rsal1>rsal6_p90 & rsal1~=.;
     replace rsal6 = rsal6_p10 if rsal1<rsal6_p10;
     drop rsal6_p90 rsal6_p10;

end;

*scratch file holding the worker-level data between the individual-level
 save and the plant-level collapse. Declared once here and overwritten in
 every iteration, so only one temporary file exists at a time instead of
 one new file per year-quarter;
tempfile d4;

forval yr = 1985/2005
{;
 forval month = 3(3)12
 {;

     display "year = `yr' month = `month'";

     use ${work}compdata`yr'`month'30, clear;

     *create qtr variable (months 3, 6, 9, 12 map to quarters 1-4);
     local qtr = `month'/3;
     gen byte qtr = `qtr';

     *create male indicator (sexo 1 is coded male, sexo 2 female, anything
      else is left missing);
     tab sexo, missing;
     gen byte male=.;
     replace male=1 if sexo==1;
     replace male=0 if sexo==2;
     drop sexo;

     *cut 1: number of obs in the raw file;
     qui sum compdata;
     local cut_`yr'`qtr'1 = r(N);

     *drop obs in registros with 9999 in them, which are not
     *real establishments;

     drop if substr(registro,4,4)=="9999";

     qui sum compdata;
     local cut_`yr'`qtr'2 = r(N);

     *drop obs without salary information (or salary=0);

     keep if sal>0 & sal~=.;

     qui sum compdata;
     local cut_`yr'`qtr'3 = r(N);

     *keep modalidades 10, 13 and 17 only. From the second quarter of 1997
      onward additionally require tipotr==1. The tipotr test stays inside
      the year condition so that it is never evaluated for earlier files;

     keep if mod==10 | mod==13 | mod==17;
     if `yr'>1997 | (`yr'==1997 & `month'>3)
     {;
      keep if tipotr==1;
     };

     qui sum compdata;
     local cut_`yr'`qtr'4 = r(N);

     *calculate employment;

     *note: in the past, I have required at least 10 workers, but the
     *cutoff is arbitrary and it seems better to err on the side of
     *including more obs;

     sort registro;
     by registro: egen empl_imss = count(sal);

     *create firm size categories;

     gen firmsize=.;
     replace firmsize=1 if empl_imss==1;
     replace firmsize=2 if inrange(empl_imss,2,5);
     replace firmsize=3 if inrange(empl_imss,6,10);
     replace firmsize=4 if inrange(empl_imss,11,15);
     replace firmsize=5 if inrange(empl_imss,16,50);
     replace firmsize=6 if inrange(empl_imss,51,100);
     replace firmsize=7 if inrange(empl_imss,101,250);
     replace firmsize=8 if empl_imss>250;

     *single-worker registros are dropped, so firmsize 1 does not survive;
     drop if empl_imss==1;

     *mark only one ob per individual per year as main job;

     *note: if >1 modalidad, take modalidad with lowest number. If >1
     *ob within modalidad, take job with highest salary;

     gsort nss mod -sal registro;
     by nss: gen main_job=1 if _n==1;

     qui sum main_job;
     local cut_`yr'`qtr'5 = r(N);


     ************** make additional variables **********************;

     *generate age variable;
     *some individuals have peculiar birth years or age<0;
     *birth year is read from characters 5-6 of nss and assumed to be 19xx;
     *age is stored as byte, which holds at most 100, so larger computed
      ages are stored as missing;

     gen byear = real(substr(nss,5,2));
     replace byear=byear+1900;
     gen byte age = year - byear if year>=byear;
     drop byear;
     replace age=. if inlist(substr(nss,1,2), "77", "79", "80", "87", "97", "99");

     qui sum age if main_job==1;
     local cut_`yr'`qtr'6 = r(N);

     *generate registration year variable;
     *some registration years are after year, set these to missing;
     *registration year is read from characters 3-4 of nss. Digits 00 are
      not mapped to 1900 (the condition is a nonzero test). From 2000 on,
      digits no larger than the current two-digit year are read as 20xx;
     *NOTE(review): the original condition was a bare nonzero test, written
      out here as ~=0 with identical behavior -- confirm that a nonmissing
      test was not intended;

     gen regyear = real(substr(nss,3,2)) + 1900 if real(substr(nss,3,2))~=0;
     if `yr'>=2000
     {;
      replace regyear = real(substr(nss,3,2)) + 2000
      if real(substr(nss,3,2))<=`yr'-2000;
     };
     replace regyear=. if regyear>`yr';

     *merge in minimum wage zones, then minimum wages;

     **merge imss directory first;
     sort registro;
     merge registro using `d1', sort uniqusing;
     tab _merge;

     *drop registros only in directory (no individual data);
     keep if _merge==1 | _merge==3;

     *some registros do not appear in directory so will not have
     muni or frac;
     *some registros in directory do not have muni or frac;
     *drop these registros now;

     drop if mpio_imss=="" | frac=="";

     drop _merge;

     *merge in minimum wages;

     sort year qtr zona;

     *some registros do not have matching min wage zones because
     *they dont appear in the directory;

     merge year qtr zona using `d2', sort uniqusing;
     tab _merge;

     *drop obs only in minwages;
     keep if _merge==1 | _merge==3;
     drop _merge;

     sort mpio_imss;

     * merge in IMSS-INEGI geographic concordance;
     merge mpio_imss using `d3', sort uniqusing keep(entmpio_inegi);

     *_merge==1 for municipalities (00,B00,Z03,Z04,Z05,Z06,Z07,Z31)
     that appear in firm directory but not the list of geographic codes;

     *Z06 has many firms and employees but doesnt match geo codes;

     *NOTE(review): the list above mentions Z31 but the recodes below
      handle Z21 -- confirm which code is intended;

     replace entmpio_inegi="15123" if mpio_imss=="Z03";
     replace entmpio_inegi="15124" if mpio_imss=="Z04";
     replace entmpio_inegi="32057" if mpio_imss=="Z05";
     replace entmpio_inegi="11020" if mpio_imss=="Z06";
     *replace metro_area_eneu=5 if mpio_imss=="Z06";
     *not assigning metro areas here anymore;
     replace entmpio_inegi="30211" if mpio_imss=="Z07";
     replace entmpio_inegi="02004" if mpio_imss=="Z21";
     *replace metro_area_eneu=21 if mpio_imss=="Z21";
     *not assigning metro areas here anymore;

     *_merge==2 for municipality codes that do not contain any firms;
     *will want to confirm this to ensure no problem with merge;
     tab _merge;
     tab mpio_imss if _merge==1, missing;
     drop if _merge==2;
     drop _merge;

     qui sum compdata if main_job==1 & age~=.;
     local cut_`yr'`qtr'7 = r(N);

     ****************** make wage variables ********************;

     *worker-level real wages rsal1-rsal6. Percentiles for rsal5 and rsal6
      are taken over all workers still in memory for this year-quarter;
     make_real_wages;

     ************************ select industries **********************;

     ** generate rama variable (first two characters of frac);
     gen rama_imss = substr(frac,1,2);

    *** save separate dataset with workers in social security;
    *preserve and restore keep the full data intact around this side output;

    preserve;
    keep if frac=="9403";
    gen byte ind=real(substr(frac,1,1));
    keep year qtr nss sal rsal* age male registro mpio_imss entmpio_inegi
      firmsize frac main_job salmin mod salmindf empl_imss cpi rama_imss ind;
    save ${work}indlevelpubsec`yr'`month', replace;
    restore;

     *** continue preparation of main dataset;
     keep if inrange(rama_imss,"20","30") | inrange(rama_imss,"32","39") |
              inrange(rama_imss,"41","42") | inrange(rama_imss,"61","69");

     qui sum compdata if main_job==1 & age~=.;
     local cut_`yr'`qtr'8 = r(N);

     *create one-digit industry that matches one-digit industry in household data;

     gen byte ind=real(substr(frac,1,1));
     replace ind=2 if ind==3;

     *park the full worker-level data. The plant-level step below still
      needs the bound variables that are dropped from the saved worker file;
     save `d4', replace;

     *rsal* in the keep list would otherwise also match the rsalmin91 bounds;
     drop rsalmin91*;

     keep year qtr nss sal rsal* age male registro mpio_imss entmpio_inegi
      firmsize frac main_job salmin mod salmindf empl_imss cpi rama_imss ind;

     compress;

     save ${work}indlevel`yr'`month', replace;

     ***************** make plant level datasets **********************;

     use `d4', clear;

     * do not include social security employees when making plant-level dataset;
     drop if frac=="9403";

     *calculate average wage at each registro. The worker-level wage
      variables are discarded and sal now holds the registro mean;

     sort registro;
     by registro: egen avg_wage = mean(sal);
     drop sal rsal1 rsal2 rsal3 rsal4 rsal5 rsal6;
     rename avg_wage sal;

     *keep only one observation per firm and only necessary variables;
     by registro: keep if _n==1;

     *registro-level real wages rsal1-rsal6, rebuilt from the mean wage.
      Percentiles for rsal5 and rsal6 are now taken over registros;
     make_real_wages;

     keep year qtr sal rsal* registro mpio_imss frac salmin entmpio_inegi
      salmindf cpi empl_imss rama_imss firmsize ind;

     gen byte mes=`month';

     compress;

     save ${tmp}registros`yr'`month', replace;


     ************** cleaning statistics ******************************;

     *** keep statistics on cleaning procedure;
     *one-row dataset holding the counts recorded after each step above;
     clear;
     set obs 1;
     gen year = `yr';
     gen month = `month';
     gen nobs_initial = `cut_`yr'`qtr'1';
     gen nobs_9999cut = `cut_`yr'`qtr'2';
     gen nobs_misssalcut = `cut_`yr'`qtr'3';
     gen nobs_modcut = `cut_`yr'`qtr'4';
     gen nobs_mainjobcut = `cut_`yr'`qtr'5';
     gen nobs_agecut = `cut_`yr'`qtr'6';
     gen nobs_directorycut = `cut_`yr'`qtr'7';
     gen nobs_indcut = `cut_`yr'`qtr'8';


     save ${tmp}cleaningstats`yr'`month', replace;

     * display numbers of observations after each cleaning step;
     di;
     di "************* numbers of observations after each step *************************";
     di;
     di "year: `yr', month: `month'";
     di "number of obs in initial compdata file: `cut_`yr'`qtr'1'";
     di "number of obs after dropping registros containing 9999: `cut_`yr'`qtr'2'";
     di "number of obs after dropping obs without salary information: `cut_`yr'`qtr'3'";
     di "number of obs in included mods: `cut_`yr'`qtr'4'";
     di "number of obs in main job: `cut_`yr'`qtr'5'";
     di "number of obs with valid ages: `cut_`yr'`qtr'6'";
     di "number of obs in IMSS firm directory: `cut_`yr'`qtr'7'";
     di "number of obs in included industries: `cut_`yr'`qtr'8'";
     di;
     di "********************************************************************************";
     di;

   }; *end of loop over qtrs;
}; *end of loop over yrs;


***************************************************;
************* stack plant datasets ****************;
***************************************************;

clear;
set mem 5g;

*append every quarterly registro-level file into one plant panel;
*the stack starts from an empty dataset. Seeding it with a one-observation
 placeholder would leave an all-missing row in the saved file, because
 nothing ever removes that row after the appends;

forval yr=1985/2005
{;
 di "Year=`yr'";
 forval mes=3(3)12
 {;
  append using ${tmp}registros`yr'`mes';
 };
 *compress once per year to keep memory use down while stacking;
 compress;
};

save ${work}imss_plant, replace;


***************************************************;
********** stack cleaning datasets ****************;
***************************************************;

clear;
set mem 5g;

*begin with a zero-observation dataset that holds only year, then append
 the one-row cleaning-statistics file of every year-quarter;
set obs 0;
gen year=.;

foreach yr of numlist 1985/2005
{;
 display "Year=`yr'";
 foreach mes of numlist 3 6 9 12
 {;
  append using ${tmp}cleaningstats`yr'`mes';
 };
 *shrink storage types after each year is added;
 compress;
};

save ${work}cleaningimssstats, replace;

*NOTE(review): no log is opened in this file -- presumably housekeeping.do
 opens the one closed here, confirm;
log close;
